import pandas as pd
import numpy as np
customerdata=pd.read_csv("Customer DataSet.csv")
customerdata
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | C19186 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | C19187 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | NaN | 0.000000 | 6 |
| 8947 | C19188 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | C19189 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | C19190 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 18 columns
customerdata.head
<bound method NDFrame.head of CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES \
0 C10001 40.900749 0.818182 95.40 0.00
1 C10002 3202.467416 0.909091 0.00 0.00
2 C10003 2495.148862 1.000000 773.17 773.17
3 C10004 1666.670542 0.636364 1499.00 1499.00
4 C10005 817.714335 1.000000 16.00 16.00
... ... ... ... ... ...
8945 C19186 28.493517 1.000000 291.12 0.00
8946 C19187 19.183215 1.000000 300.00 0.00
8947 C19188 23.398673 0.833333 144.40 0.00
8948 C19189 13.457564 0.833333 0.00 0.00
8949 C19190 372.708075 0.666667 1093.25 1093.25
INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY \
0 95.40 0.000000 0.166667
1 0.00 6442.945483 0.000000
2 0.00 0.000000 1.000000
3 0.00 205.788017 0.083333
4 0.00 0.000000 0.083333
... ... ... ...
8945 291.12 0.000000 1.000000
8946 300.00 0.000000 1.000000
8947 144.40 0.000000 0.833333
8948 0.00 36.558778 0.000000
8949 0.00 127.040008 0.666667
ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY \
0 0.000000 0.083333
1 0.000000 0.000000
2 1.000000 0.000000
3 0.083333 0.000000
4 0.083333 0.000000
... ... ...
8945 0.000000 0.833333
8946 0.000000 0.833333
8947 0.000000 0.666667
8948 0.000000 0.000000
8949 0.666667 0.000000
CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT \
0 0.000000 0 2 1000.0
1 0.250000 4 0 7000.0
2 0.000000 0 12 7500.0
3 0.083333 1 1 7500.0
4 0.000000 0 1 1200.0
... ... ... ... ...
8945 0.000000 0 6 1000.0
8946 0.000000 0 6 1000.0
8947 0.000000 0 5 1000.0
8948 0.166667 2 0 500.0
8949 0.333333 2 23 1200.0
PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 201.802084 139.509787 0.000000 12
1 4103.032597 1072.340217 0.222222 12
2 622.066742 627.284787 0.000000 12
3 0.000000 NaN 0.000000 12
4 678.334763 244.791237 0.000000 12
... ... ... ... ...
8945 325.594462 48.886365 0.500000 6
8946 275.861322 NaN 0.000000 6
8947 81.270775 82.418369 0.250000 6
8948 52.549959 55.755628 0.250000 6
8949 63.165404 88.288956 0.000000 6
[8950 rows x 18 columns]>
customerdata.tail
<bound method NDFrame.tail of CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES \
0 C10001 40.900749 0.818182 95.40 0.00
1 C10002 3202.467416 0.909091 0.00 0.00
2 C10003 2495.148862 1.000000 773.17 773.17
3 C10004 1666.670542 0.636364 1499.00 1499.00
4 C10005 817.714335 1.000000 16.00 16.00
... ... ... ... ... ...
8945 C19186 28.493517 1.000000 291.12 0.00
8946 C19187 19.183215 1.000000 300.00 0.00
8947 C19188 23.398673 0.833333 144.40 0.00
8948 C19189 13.457564 0.833333 0.00 0.00
8949 C19190 372.708075 0.666667 1093.25 1093.25
INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY \
0 95.40 0.000000 0.166667
1 0.00 6442.945483 0.000000
2 0.00 0.000000 1.000000
3 0.00 205.788017 0.083333
4 0.00 0.000000 0.083333
... ... ... ...
8945 291.12 0.000000 1.000000
8946 300.00 0.000000 1.000000
8947 144.40 0.000000 0.833333
8948 0.00 36.558778 0.000000
8949 0.00 127.040008 0.666667
ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY \
0 0.000000 0.083333
1 0.000000 0.000000
2 1.000000 0.000000
3 0.083333 0.000000
4 0.083333 0.000000
... ... ...
8945 0.000000 0.833333
8946 0.000000 0.833333
8947 0.000000 0.666667
8948 0.000000 0.000000
8949 0.666667 0.000000
CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT \
0 0.000000 0 2 1000.0
1 0.250000 4 0 7000.0
2 0.000000 0 12 7500.0
3 0.083333 1 1 7500.0
4 0.000000 0 1 1200.0
... ... ... ... ...
8945 0.000000 0 6 1000.0
8946 0.000000 0 6 1000.0
8947 0.000000 0 5 1000.0
8948 0.166667 2 0 500.0
8949 0.333333 2 23 1200.0
PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 201.802084 139.509787 0.000000 12
1 4103.032597 1072.340217 0.222222 12
2 622.066742 627.284787 0.000000 12
3 0.000000 NaN 0.000000 12
4 678.334763 244.791237 0.000000 12
... ... ... ... ...
8945 325.594462 48.886365 0.500000 6
8946 275.861322 NaN 0.000000 6
8947 81.270775 82.418369 0.250000 6
8948 52.549959 55.755628 0.250000 6
8949 63.165404 88.288956 0.000000 6
[8950 rows x 18 columns]>
customerdata.info
<bound method DataFrame.info of CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES \
0 C10001 40.900749 0.818182 95.40 0.00
1 C10002 3202.467416 0.909091 0.00 0.00
2 C10003 2495.148862 1.000000 773.17 773.17
3 C10004 1666.670542 0.636364 1499.00 1499.00
4 C10005 817.714335 1.000000 16.00 16.00
... ... ... ... ... ...
8945 C19186 28.493517 1.000000 291.12 0.00
8946 C19187 19.183215 1.000000 300.00 0.00
8947 C19188 23.398673 0.833333 144.40 0.00
8948 C19189 13.457564 0.833333 0.00 0.00
8949 C19190 372.708075 0.666667 1093.25 1093.25
INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY \
0 95.40 0.000000 0.166667
1 0.00 6442.945483 0.000000
2 0.00 0.000000 1.000000
3 0.00 205.788017 0.083333
4 0.00 0.000000 0.083333
... ... ... ...
8945 291.12 0.000000 1.000000
8946 300.00 0.000000 1.000000
8947 144.40 0.000000 0.833333
8948 0.00 36.558778 0.000000
8949 0.00 127.040008 0.666667
ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY \
0 0.000000 0.083333
1 0.000000 0.000000
2 1.000000 0.000000
3 0.083333 0.000000
4 0.083333 0.000000
... ... ...
8945 0.000000 0.833333
8946 0.000000 0.833333
8947 0.000000 0.666667
8948 0.000000 0.000000
8949 0.666667 0.000000
CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT \
0 0.000000 0 2 1000.0
1 0.250000 4 0 7000.0
2 0.000000 0 12 7500.0
3 0.083333 1 1 7500.0
4 0.000000 0 1 1200.0
... ... ... ... ...
8945 0.000000 0 6 1000.0
8946 0.000000 0 6 1000.0
8947 0.000000 0 5 1000.0
8948 0.166667 2 0 500.0
8949 0.333333 2 23 1200.0
PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 201.802084 139.509787 0.000000 12
1 4103.032597 1072.340217 0.222222 12
2 622.066742 627.284787 0.000000 12
3 0.000000 NaN 0.000000 12
4 678.334763 244.791237 0.000000 12
... ... ... ... ...
8945 325.594462 48.886365 0.500000 6
8946 275.861322 NaN 0.000000 6
8947 81.270775 82.418369 0.250000 6
8948 52.549959 55.755628 0.250000 6
8949 63.165404 88.288956 0.000000 6
[8950 rows x 18 columns]>
customerdata.describe
<bound method NDFrame.describe of CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES \
0 C10001 40.900749 0.818182 95.40 0.00
1 C10002 3202.467416 0.909091 0.00 0.00
2 C10003 2495.148862 1.000000 773.17 773.17
3 C10004 1666.670542 0.636364 1499.00 1499.00
4 C10005 817.714335 1.000000 16.00 16.00
... ... ... ... ... ...
8945 C19186 28.493517 1.000000 291.12 0.00
8946 C19187 19.183215 1.000000 300.00 0.00
8947 C19188 23.398673 0.833333 144.40 0.00
8948 C19189 13.457564 0.833333 0.00 0.00
8949 C19190 372.708075 0.666667 1093.25 1093.25
INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY \
0 95.40 0.000000 0.166667
1 0.00 6442.945483 0.000000
2 0.00 0.000000 1.000000
3 0.00 205.788017 0.083333
4 0.00 0.000000 0.083333
... ... ... ...
8945 291.12 0.000000 1.000000
8946 300.00 0.000000 1.000000
8947 144.40 0.000000 0.833333
8948 0.00 36.558778 0.000000
8949 0.00 127.040008 0.666667
ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY \
0 0.000000 0.083333
1 0.000000 0.000000
2 1.000000 0.000000
3 0.083333 0.000000
4 0.083333 0.000000
... ... ...
8945 0.000000 0.833333
8946 0.000000 0.833333
8947 0.000000 0.666667
8948 0.000000 0.000000
8949 0.666667 0.000000
CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT \
0 0.000000 0 2 1000.0
1 0.250000 4 0 7000.0
2 0.000000 0 12 7500.0
3 0.083333 1 1 7500.0
4 0.000000 0 1 1200.0
... ... ... ... ...
8945 0.000000 0 6 1000.0
8946 0.000000 0 6 1000.0
8947 0.000000 0 5 1000.0
8948 0.166667 2 0 500.0
8949 0.333333 2 23 1200.0
PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 201.802084 139.509787 0.000000 12
1 4103.032597 1072.340217 0.222222 12
2 622.066742 627.284787 0.000000 12
3 0.000000 NaN 0.000000 12
4 678.334763 244.791237 0.000000 12
... ... ... ... ...
8945 325.594462 48.886365 0.500000 6
8946 275.861322 NaN 0.000000 6
8947 81.270775 82.418369 0.250000 6
8948 52.549959 55.755628 0.250000 6
8949 63.165404 88.288956 0.000000 6
[8950 rows x 18 columns]>
customerdata.shape
(8950, 18)
customerdata.size
161100
customerdata.columns
Index(['CUST_ID', 'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
dtype='object')
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# Define K-means model
kmeans_model = KMeans(init='k-means++', max_iter=400, random_state=42)
kmeans_model
KMeans(max_iter=400, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(max_iter=400, random_state=42)
# Train the model
kmeans_model.fit(customerdata[['PURCHASES','ONEOFF_PURCHASES']])
KMeans(max_iter=400, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(max_iter=400, random_state=42)
kmeans_model
KMeans(max_iter=400, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(max_iter=400, random_state=42)
# Finding the optimal number of clusters
# Create the K means model for different values of K
def try_different_clusters(K, data):
cluster_values = list(range(1, K+1))
inertias=[]
for c in cluster_values:
model = KMeans(n_clusters = c,init='k-means++',max_iter=400,random_state=42)
model.fit(data)
inertias.append(model.inertia_)
return inertias
# Find output for k values between 1 to 12
outputs = try_different_clusters(12, customerdata[['PURCHASES','INSTALLMENTS_PURCHASES']])
distances = pd.DataFrame({"clusters": list(range(1, 13)),"sum of squared distances": outputs})
outputs
[48172785742.51006, 25570094297.549667, 14135868527.756325, 9419113566.979755, 7301092843.49474, 6351317198.853046, 5610332091.39718, 5025671204.126617, 4585178348.975172, 3753994391.8966494, 3480186022.077272, 3330756578.154151]
distances
| clusters | sum of squared distances | |
|---|---|---|
| 0 | 1 | 4.817279e+10 |
| 1 | 2 | 2.557009e+10 |
| 2 | 3 | 1.413587e+10 |
| 3 | 4 | 9.419114e+09 |
| 4 | 5 | 7.301093e+09 |
| 5 | 6 | 6.351317e+09 |
| 6 | 7 | 5.610332e+09 |
| 7 | 8 | 5.025671e+09 |
| 8 | 9 | 4.585178e+09 |
| 9 | 10 | 3.753994e+09 |
| 10 | 11 | 3.480186e+09 |
| 11 | 12 | 3.330757e+09 |
# Finding optimal number of clusters k
figure = go.Figure()
figure.add_trace(go.Scatter(x=distances["clusters"], y=distances["sum of squared distances"]))
figure.update_layout(xaxis = dict(tick0 = 1,dtick = 1,tickmode = 'linear'),
xaxis_title="Number of clusters",
yaxis_title="Sum of squared distances",
title_text="Finding optimal number of clusters using elbow method")
figure.show()
# Re-Train K means model with k=5
kmeans_model_new = KMeans(n_clusters = 5,init='k-means++',max_iter=400,random_state=42)
kmeans_model_new.fit_predict(customerdata[['PURCHASES','INSTALLMENTS_PURCHASES']])
array([1, 1, 1, ..., 1, 1, 1])
# Visualizing customer segments
# Create data arrays
cluster_centers = kmeans_model_new.cluster_centers_
data = np.expm1(cluster_centers)
points = np.append(data, cluster_centers, axis=1)
points
C:\Users\Amrendra Mishra\AppData\Local\Temp\ipykernel_31140\2537772170.py:5: RuntimeWarning: overflow encountered in expm1
array([[ inf, inf, 4.52049403e+003,
1.63593804e+003],
[1.48411070e+117, 6.96231243e+055, 2.69797272e+002,
1.28582692e+002],
[ inf, inf, 2.86012077e+004,
6.55843227e+003],
[ inf, inf, 1.01930530e+004,
3.95564688e+003],
[ inf, inf, 1.77294678e+003,
8.04046557e+002]])
# Add "clusters" to customers data
points = np.append(points, [[0], [1], [2], [3], [4]], axis=1)
customerdata["clusters"] = kmeans_model_new.labels_
customerdata
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | clusters | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 | 1 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 | 1 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 | 1 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 | 4 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | C19186 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 | 1 |
| 8946 | C19187 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | NaN | 0.000000 | 6 | 1 |
| 8947 | C19188 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 | 1 |
| 8948 | C19189 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 | 1 |
| 8949 | C19190 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 | 1 |
8950 rows × 19 columns
# visualize clusters
figure = px.scatter_3d(customerdata,
color='clusters',
x="PURCHASES",
y="ONEOFF_PURCHASES",
z="CASH_ADVANCE",
category_orders = {"clusters": ["0", "1", "2", "3", "4"]}
)
figure.update_layout()
figure.show()